# -*- coding: utf-8 -*-
"""YouTube Comments Violence Lexicon Test

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1lYaEIWw0-cePHiT6fqECnnwfug0r58HX

This code block reads in the YouTube comments txt, sorts it into csv columns, and cleans the comments (removes stopwords, lemmatizes, removes punctuation) before writing the data to the dataframe.

I have chosen to leave capitalizations in the dataset, as it seems to be an important factor in flagging for violence.

The uncleaned_comments.csv and violent_uncleaned_comments.csv files exist so that I can inspect the comments by hand and examine the data qualitatively as well as quantitatively.
"""

import numpy as np
from collections import Counter
import pandas as pd
import csv
import string
import spacy

# Load spaCy's small English pipeline once; every tokenization pass below reuses it
nlp = spacy.load('en_core_web_sm')

# Corpus location and the CSV files generated from it
input_file = "VideoCommentsThreatCorpus.txt"
cleaned_output_file = "cleaned_comments.csv"
uncleaned_output_file = "uncleaned_comments.csv"
# Violent sentences only, exported without preprocessing for manual review
violent_uncleaned_output_file = "violent_uncleaned_comments.csv"

# Parallel column lists: position i in each list describes the same labelled sentence
video_list = []
comment_list = []
commenter_list = []
time_list = []
sentence_list = []
violence_list = []

# Sentences labelled violent, collected twice: once for the frequency analysis
# and once for the raw-text CSV export
violent_comments = []
violent_uncleaned_comments = []

# Header fields carried forward from the most recent "Video #" line
current_video = ""
current_comment = ""
current_commenter = ""

# Read and process the input file.
#
# Two kinds of lines are recognised (everything else is ignored):
#   * header lines starting with "Video #", holding four ", "-separated fields
#     (video, comment, commenter, time); the first three carry a "#<id>" suffix
#   * labelled sentence lines of the form "<0|1>\t<sentence>"
# Each labelled sentence is stored together with the most recent header's fields.

# Defined up front so a labelled line that appears before any header stores an
# empty time instead of raising NameError.
time_ago = ""

with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line.startswith("Video #"):
            # maxsplit=3 keeps any further ", " inside the trailing time field
            # instead of breaking the four-way unpack.
            current_video, current_comment, current_commenter, time_ago = line.split(', ', 3)
            current_video = current_video.split('#')[1]  # Extract the numeric part
            current_comment = current_comment.split('#')[1]  # Extract the numeric part
            current_commenter = current_commenter.split('#')[1]  # Extract the numeric part
        elif line.startswith(("0\t", "1\t")):
            parts = line.split('\t')
            violence = int(parts[0])  # Convert the 0/1 label to an integer
            sentence = parts[1]  # Keep the original casing and stopwords
            video_list.append(current_video)
            comment_list.append(current_comment)
            commenter_list.append(current_commenter)
            time_list.append(time_ago)
            sentence_list.append(sentence)
            violence_list.append(violence)

            if violence == 1:
                violent_comments.append(sentence)  # For word frequency analysis
                violent_uncleaned_comments.append(sentence)  # For the raw-text CSV export

def _lemmatize_content_words(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens of *text*, space-joined."""
    kept = []
    for token in nlp(text):
        if token.is_alpha and not token.is_stop:
            kept.append(token.lemma_)
    return ' '.join(kept)


def _strip_punctuation(text):
    """Return *text* with every character of string.punctuation removed."""
    return text.translate(str.maketrans('', '', string.punctuation))


# Assemble the parsed columns; the "sentence" column is preprocessed below
cleaned_data = {
    "video": video_list,
    "comment": comment_list,
    "commenter": commenter_list,
    "time": time_list,
    "sentence": sentence_list,
    "violence": violence_list,
}
cleaned_df = pd.DataFrame(cleaned_data)

# Step 1: drop stop words and non-alphabetic tokens, lemmatize what is left
cleaned_df["sentence"] = cleaned_df["sentence"].apply(_lemmatize_content_words)

# Step 2: remove any remaining punctuation characters
cleaned_df["sentence"] = cleaned_df["sentence"].apply(_strip_punctuation)

# Persist the cleaned table
cleaned_df.to_csv(cleaned_output_file, index=False, quoting=csv.QUOTE_MINIMAL)

print(f"Cleaned data written to {cleaned_output_file}")

# Same columns as the cleaned table, but with the sentences exactly as parsed
uncleaned_columns = ["video", "comment", "commenter", "time", "sentence", "violence"]
uncleaned_values = [video_list, comment_list, commenter_list, time_list, sentence_list, violence_list]
uncleaned_data = dict(zip(uncleaned_columns, uncleaned_values))
uncleaned_df = pd.DataFrame(uncleaned_data)

# Persist the raw table so the comments can be inspected by hand
uncleaned_df.to_csv(
    uncleaned_output_file,
    index=False,
    quoting=csv.QUOTE_MINIMAL,
)

print(f"Uncleaned data written to {uncleaned_output_file}")

# Reload the raw table from disk
df = pd.read_csv("uncleaned_comments.csv")

# Count the rows carrying each label (1 = violent, 0 = non-violent)
is_violent = df["violence"] == 1
is_non_violent = df["violence"] == 0
extreme_count = int(is_violent.sum())
not_extreme_count = int(is_non_violent.sum())

# Report the class balance
print(f"Number of Violent Comments: {extreme_count}")
print(f"Number of Non-Violent Comments: {not_extreme_count}")

# Single-column table holding only the sentences labelled violent, unprocessed
violent_uncleaned_data = dict(comment=violent_uncleaned_comments)
violent_uncleaned_df = pd.DataFrame(violent_uncleaned_data)

# Persist it for manual review and for the later lexicon/TF-IDF steps
violent_uncleaned_df.to_csv(
    violent_uncleaned_output_file,
    index=False,
    quoting=csv.QUOTE_MINIMAL,
)

print(f"Violent uncleaned data written to {violent_uncleaned_output_file}")

import csv

# Re-emit the first two columns of violence_lexicon.csv (term, weight) as plain
# "term,weight" lines in lexicon_formatted.txt.
lexicon_terms = []
lexicon_weights = []

# newline='' is what the csv module asks for so quoted fields containing line
# breaks are parsed correctly; utf-8 matches how pandas reads this same file
# elsewhere in the script.
with open('violence_lexicon.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip header row (tolerates an empty file)

    for row in reader:
        # Blank lines come back as [] and would raise IndexError below
        if len(row) < 2:
            continue
        lexicon_terms.append(row[0])
        lexicon_weights.append(row[1])  # kept as the raw string from the file

# Write one "term,weight" line per lexicon entry
with open('lexicon_formatted.txt', 'w', encoding='utf-8') as output:
    output.writelines(
        f"{term},{weight}\n" for term, weight in zip(lexicon_terms, lexicon_weights)
    )

print("Lexicon formatted!")

# Count the most frequent words and 2-word phrases in the violent comments.

# How many entries to report; the printed headers are derived from these so the
# label can no longer disagree with the number of rows actually shown.
TOP_WORD_COUNT = 50
TOP_PHRASE_COUNT = 20

# All violent sentences are analysed as one document
all_text = ' '.join(violent_comments)
doc = nlp(all_text)

# Keep alphabetic, non-stop-word tokens with their original casing
tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]

# Pair each kept token with the next kept token.
# NOTE(review): because stop words are removed first and the sentences are joined,
# a "phrase" may bridge a removed word or a sentence boundary.
phrases = [' '.join(pair) for pair in zip(tokens, tokens[1:])]

# Count the frequency of each word and 2-word phrase
word_freq = Counter(tokens)
phrase_freq = Counter(phrases)

most_common_words = word_freq.most_common(TOP_WORD_COUNT)
most_common_phrases = phrase_freq.most_common(TOP_PHRASE_COUNT)

# Display the results
print(f"\n{TOP_WORD_COUNT} Most Frequent Words in Violent Comments:")
for word, frequency in most_common_words:
    print(f"{word}: {frequency}")

print(f"\n{TOP_PHRASE_COUNT} Most Frequent 2-Word Phrases in Violent Comments:")
for phrase, frequency in most_common_phrases:
    print(f"{phrase}: {frequency}")

# Measure how many lexicon terms occur as words in the violent comments.
violence_lexicon = pd.read_csv("violence_lexicon.csv")
lexicon_words = set(violence_lexicon["term"].tolist())

# Load uncleaned comments.
# read_csv turns empty cells and strings such as "NA"/"null" into NaN, which
# would make " ".join fail; drop them, as the TF-IDF step later in this script
# does for the same file.
violent_uncleaned_df = pd.read_csv("violent_uncleaned_comments.csv")
violent_uncleaned_comments = violent_uncleaned_df["comment"].dropna().astype(str).tolist()

# Tokenize all violent comments as one document and keep the distinct
# alphabetic, non-stop-word tokens (original casing preserved)
violent_uncleaned_text = " ".join(violent_uncleaned_comments)
violence_doc = nlp(violent_uncleaned_text)
violent_words = {token.text for token in violence_doc if not token.is_stop and token.is_alpha}

# Lexicon terms that appear verbatim (case-sensitive) among those tokens
matched_words = lexicon_words & violent_words

# Print statistics; an empty lexicon reports 0% instead of dividing by zero
num_lexicon_words = len(lexicon_words)
num_matched = len(matched_words)
percentage = num_matched / num_lexicon_words * 100 if num_lexicon_words else 0.0

print(f"Number of lexicon words: {num_lexicon_words}")
print(f"Number matched in violent comments: {num_matched}")
print(f"Percentage matched: {percentage:.2f}%")

from scipy import stats
import random

# Generate a null distribution: repeatedly draw a random subset of the words seen
# in violent comments and count how many of them are lexicon terms.

# random.sample() requires a sequence; passing a set raises TypeError on
# Python 3.11+. Sorting also makes the draws reproducible for a given seed.
word_pool = sorted(violent_words)

# A sample cannot be larger than the pool it is drawn from (ValueError otherwise).
# NOTE(review): if the lexicon is at least as large as the pool, every draw is the
# whole pool and the null distribution is degenerate.
sample_size = min(len(lexicon_words), len(word_pool))

null_counts = []
for _ in range(1000):
    random_words = random.sample(word_pool, sample_size)
    null_matches = len(lexicon_words & set(random_words))
    null_counts.append(null_matches)

# Compare real matches: share of null draws matching at least as many terms,
# with +1 in numerator and denominator so the p-value is never exactly 0
num_matched = len(matched_words)
p_value = (sum(1 for c in null_counts if c >= num_matched) + 1) / (len(null_counts) + 1)

print(f"Observed matches: {num_matched}")
print(f"Mean null matches: {np.mean(null_counts):.1f}")
print(f"p-value: {p_value:.3f}")

"""This bit of code takes in the violence-flagged (uncleaned) YouTube comments and outputs a CSV file containing a vocabulary list extracted from them with the TF-IDF feature extraction parameters below. The vocabulary is used to find terms in the violence-flagged comments that could enhance the existing lexicon."""

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Build a TF-IDF vocabulary (unigrams and bigrams) from the violent comments and
# save it to vocabulary.csv.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the violent comments exported earlier
dataset_file = "violent_uncleaned_comments.csv"
df = pd.read_csv(dataset_file)

# Drop rows with missing values in the "comment" column
df.dropna(subset=["comment"], inplace=True)

# Define custom stop words
custom_stop_words = ["stop", "like"]

# Define TF-IDF vectorizer parameters
max_features = 1000
min_df = 2
max_df = 0.95
ngram_range = (1, 2)

# Combine scikit-learn's built-in English stop words with the custom ones.
# (list("english") would only produce the letters e, n, g, l, i, s, h.)
stop_words = sorted(ENGLISH_STOP_WORDS.union(custom_stop_words))

# Initialize the TF-IDF vectorizer with the combined stop words
tfidf_vectorizer = TfidfVectorizer(
    max_features=max_features,
    min_df=min_df,
    max_df=max_df,
    ngram_range=ngram_range,
    stop_words=stop_words
)

# Fit and transform the data using the vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(df["comment"])

# The learned vocabulary (feature names), fetched once and reused
vocabulary = tfidf_vectorizer.get_feature_names_out()
num_features = len(vocabulary)

# Print the number of features
print(f"Number of features (vocabulary size): {num_features}")

# Create a DataFrame for the vocabulary
vocabulary_df = pd.DataFrame(vocabulary, columns=["Word"])

# Save the vocabulary to a new CSV file
vocabulary_file = "vocabulary.csv"
vocabulary_df.to_csv(vocabulary_file, index=False)

print(f"Vocabulary saved to {vocabulary_file}")

import pandas as pd

# Input files: the hand-built lexicon and the TF-IDF vocabulary
violence_lexicon_file = "violence_lexicon.csv"
vocabulary_file = "vocabulary.csv"

violence_lexicon = pd.read_csv(violence_lexicon_file)
vocabulary = pd.read_csv(vocabulary_file)

# Reduce both tables to sets of distinct entries
lexicon_terms = set(violence_lexicon["term"])
vocabulary_words = set(vocabulary["Word"])

# Entries present in both the lexicon and the vocabulary
common_words = lexicon_terms & vocabulary_words

# Share of the lexicon that also appears in the vocabulary, as a percentage
lexicon_share = len(common_words) / len(lexicon_terms)
percentage_common_words = lexicon_share * 100

# Display the results
print(f"Number of words in violence lexicon: {len(lexicon_terms)}")
print(f"Number of words in vocabulary: {len(vocabulary_words)}")
print(f"Number of common words: {len(common_words)}")
print(f"Percentage of words in violence lexicon present in vocabulary: {percentage_common_words:.2f}%")

import numpy as np
from scipy.stats import chi2

def fleiss_kappa(table):
    """
    Calculate Fleiss' kappa for inter-rater reliability.

    Every rater is expected to have rated every sample. The statistic is
    ``(P_bar - P_e) / (1 - P_e)`` where ``P_bar`` is the mean per-sample
    agreement among rater pairs and ``P_e`` is the agreement expected by chance
    from the overall category proportions.

    :param table: a matrix (array-like) with raters as rows and samples as
        columns; each cell is the category that rater assigned to that sample
    :returns: Fleiss' kappa score as a float (1.0 for perfect agreement);
        ``nan`` when every rating is the same category, because chance
        agreement is then 1 and the statistic is undefined
    :raises ValueError: if ``table`` is not 2-D, has fewer than two raters, or
        has no samples
    """
    table = np.asarray(table)
    if table.ndim != 2:
        raise ValueError("table must be 2-D (raters x samples)")

    n_raters, n_samples = table.shape
    if n_raters < 2 or n_samples < 1:
        raise ValueError("need at least two raters and one sample")

    categories = np.unique(table)

    # counts[i, j] = number of raters who assigned sample i to category j
    counts = np.stack([(table == c).sum(axis=0) for c in categories], axis=1)

    # Overall proportion of ratings per category, and the chance agreement P_e
    p_j = counts.sum(axis=0) / (n_raters * n_samples)
    P_e = float(np.sum(p_j ** 2))

    # Per-sample agreement: fraction of rater pairs that agree on that sample
    P_i = np.sum(counts * (counts - 1), axis=1) / (n_raters * (n_raters - 1))
    P_bar = float(np.mean(P_i))

    if np.isclose(P_e, 1.0):
        return float("nan")

    return (P_bar - P_e) / (1 - P_e)

# Ratings matrix: one row per rater, one column per rated item (categories 1-3).
# Row 0 is the control rating; rows 1-3 are the human raters.
data = np.array([[3, 2, 1, 2, 2, 3, 3, 3, 1, 3, 2, 1, 1, 2, 2, 1, 3, 3, 2, 2, 3, 1, 3, 1, 3, 3, 3, 3, 3, 1], # control
                 [3, 2, 1, 2, 2, 2, 2, 3, 1, 3, 2, 1, 1, 3, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 3, 2, 3, 2, 1, 1], # rater A
                 [3, 2, 1, 2, 3, 3, 3, 3, 1, 3, 2, 1, 1, 2, 2, 1, 3, 3, 2, 2, 3, 1, 2, 1, 3, 3, 3, 3, 3, 1], # rater C
                 [3, 2, 1, 2, 2, 1, 3, 2, 1, 3, 2, 1, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 1, 2, 3, 2, 3, 1, 2, 1]]) # rater E

# Extract number of samples, categories and raters
n_raters, n_samples = data.shape
categories = np.unique(data)
n_cat = len(categories)

# Calculate Fleiss' kappa
kappa = fleiss_kappa(data)

# Calculate chi-squared value
# NOTE(review): this chi-squared conversion is not the usual significance test for
# Fleiss' kappa (normally a z-test on kappa / SE) — confirm the intended method.
X2 = n_samples * (n_raters - 1) * kappa

# Calculate p-value
p_value = 1 - chi2.cdf(X2, (n_cat - 1) * (n_raters - 1))

print(f'Fleiss Kappa = {kappa:.3f}')
print(f'p-value = {p_value:.3f}')

# Simple percentage agreement of the raters with the control row.
# Only the rater rows are compared, so only they belong in the denominator;
# dividing by data.size (which includes the control row) capped the result at 75%.
rater_ratings = data[1:]
num_ratings = rater_ratings.size

# Compute count of ratings that equal the control rating for the same item
agree_count = np.sum(rater_ratings == data[0])

# Overall simple agreement
agreement = agree_count / num_ratings * 100

print(f'Percentage agreement: {agreement:.2f}%')

"""Code for testing the initial Violence Lexicon against the cleaned YouTube comments."""

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
import random
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier

# NOTE(review): this random seed is not used anywhere below; every estimator
# and splitter hard-codes its own random_state.
random_state = random.randint(0, 100)

# Load csv files
violence_lexicon_df = pd.read_csv("violence_lexicon.csv")
cleaned_comments_df = pd.read_csv("cleaned_comments.csv")

# Identity mapping for labels 0 and 1 (any other value would become NaN)
cleaned_comments_df["violence"] = cleaned_comments_df["violence"].map({0: 0, 1: 1})

# NOTE(review): in cleaned_comments.csv the "comment" column holds the comment
# number parsed from the corpus header, while the text lives in "sentence" —
# confirm this is really the intended feature.
X = cleaned_comments_df["comment"].values.reshape(-1, 1)  # 2-D: one feature column
y = cleaned_comments_df["violence"].values  # 1-D, as scikit-learn expects for targets

# Split FIRST (80% train / 20% test, stratified on the label), then oversample
# only the training portion. Oversampling before the split duplicates rows into
# both sides, so the test set would contain copies of training rows and inflate
# every score.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)

# The balanced training data used by every model below; the test set keeps the
# original class distribution
X_train, y_train = X_resampled, y_resampled

"""Code for tuning the hyperparameters for the boosting classifiers - the XGBoost one only took 7 or 8 minutes to run, but I killed the Gradient Boosting one after it continued to run for over 40 minutes. When I have more time to let it run I will probably do so in order to get the most accurate parameters."""

# Grid of XGBoost hyperparameters to search exhaustively
param_grid_xgb = dict(
    learning_rate=[0.001, 0.01, 0.1, 0.2, 0.3],
    n_estimators=[50, 100, 150, 200],
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 2, 3, 4],
)

xgb_classifier = xgb.XGBClassifier(random_state=42)

# 5-fold cross-validated search scored on accuracy, using all available cores
grid_search_xgb = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Keep both the winning parameter set and the estimator fitted with it
best_estimator_xgb = grid_search_xgb.best_estimator_
best_params_xgb = grid_search_xgb.best_params_

print("Best Hyperparameters for XGBoost:", best_params_xgb)

# Test-set predictions from the tuned model are left disabled:
#y_test_pred_best_xgb = best_estimator_xgb.predict(X_test)

# Hyperparameter tuning for Gradient Boosting - THIS TAKES OVER 40 MINUTES.
# The search is therefore opt-in: it only runs when the flag below is set to True,
# so running the whole script no longer triggers it by accident.
RUN_GB_GRID_SEARCH = False

param_grid_gb = {
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_classifier = GradientBoostingClassifier(random_state=42)

# 5-fold cross-validated search scored on accuracy, using all available cores
grid_search_gb = GridSearchCV(gb_classifier, param_grid_gb, cv=5, scoring='accuracy', n_jobs=-1)

if RUN_GB_GRID_SEARCH:
    grid_search_gb.fit(X_train, y_train)

    best_params_gb = grid_search_gb.best_params_
    best_estimator_gb = grid_search_gb.best_estimator_

    print("Best Hyperparameters for Gradient Boosting:", best_params_gb)
else:
    print("Skipping Gradient Boosting grid search (set RUN_GB_GRID_SEARCH = True to run it)")

# Imports
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Base estimator whose hyperparameters are tuned in the inner loop
rf = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values
rf_params = dict(
    n_estimators=[100, 200, 500],
    max_depth=[5, 8, 15],
    min_samples_split=[2, 5, 10],
)

# Inner folds choose hyperparameters; outer folds estimate performance
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Inner loop: grid search selecting the combination with the best F1
rf_gridcv = GridSearchCV(estimator=rf, param_grid=rf_params, cv=inner_cv, scoring='f1')

# Metrics reported for each outer fold
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

# Outer loop: evaluate the whole grid search as a single estimator
results = cross_validate(rf_gridcv, X_train, y_train, cv=outer_cv, scoring=scoring)

# Report the mean of each metric across the outer folds
for label, result_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision:", 'test_precision'),
    ("Recall:", 'test_recall'),
    ("F1 Score:", 'test_f1'),
):
    print(label, results[result_key].mean())

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# cross_validate() fits clones of the estimator it is given, so rf_gridcv
# itself is still unfitted here and rf_gridcv.estimator is just the untuned base
# forest. Run the grid search on the full training data and take the tuned model
# (GridSearchCV refits the best parameter combination on all of X_train).
rf_gridcv.fit(X_train, y_train)
rf_model = rf_gridcv.best_estimator_

# Generate test predictions
rf_test_pred = rf_model.predict(X_test)

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, rf_test_pred)

plt.figure()
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Random Forest Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")

print(" Done")

"""Code for testing different classifiers - I comment/uncomment the models when relevant, and recorded the results below. I didn't end up using LGBM or CatBoost, but the code is still in here in case I want to test them out later on."""

# Initialize Classifiers
rf_classifier = RandomForestClassifier(random_state=42) # Random Forest
svm_classifier = SVC(random_state=42) # Kernel SVM
lr_classifier = LogisticRegression(random_state=42) # Logistic Regression, library-default regularisation
lsvm = LinearSVC(C=1.0, random_state=42) # Linear SVM
logit = LogisticRegression(penalty='l1', solver='saga', random_state=42) # Logistic Regression w/ L1 penalty

# Hyperparameters for XGBoost: {'learning_rate': 0.3, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 200}

# Different models to test fit (uncomment exactly one alternative to swap the model)
#model = xgb.XGBClassifier(learning_rate = 0.3, max_depth = 4, min_child_weight = 3, n_estimators = 200, random_state = 42)
#model = rf_classifier.fit(X_train, y_train)
#model = lr_classifier.fit(X_train, y_train)
#model = svm_classifier.fit(X_train, y_train)
#model = lsvm.fit(X_train, y_train)
#model = logit.fit(X_train, y_train)
#model = AdaBoostClassifier(random_state=42)

# Gradient boosting with the library's default settings. The previous call spelled
# out every default, including loss='deviance', a name current scikit-learn
# releases no longer accept; leaving `loss` unset selects the same default loss on
# both old and new versions.
model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=3)
model.fit(X_train, y_train)

# Function to apply the "violence_lexicon" to YouTube comments
def apply_lexicon(comment, lexicon_df):
    """Return the summed weight of every lexicon term that occurs in *comment*.

    *comment* is converted with ``str()`` first, so non-string inputs are
    accepted. A term counts when it appears anywhere in that text as a
    case-sensitive substring, and each term contributes its weight at most once.

    :param comment: the text (or any object) to score
    :param lexicon_df: DataFrame with a "term" column and a "weight" column
    :returns: the total weight of the matching terms, 0 when nothing matches
    """
    text = str(comment)
    weighted_terms = zip(lexicon_df["term"].tolist(), lexicon_df["weight"].tolist())
    return sum(weight for term, weight in weighted_terms if term in text)

# Lexicon score for every row of the training and testing sets.
# NOTE(review): these scores are not used by anything below — the predictions
# come from model.predict on X_train / X_test directly; confirm whether the
# scores were meant to be the model's features.
X_train_scores, X_test_scores = (
    [apply_lexicon(row, violence_lexicon_df) for row in split_rows]
    for split_rows in (X_train, X_test)
)

# Predicted labels from the fitted model for both splits
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Accuracy on each split
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Precision and recall for the positive class (F-score and support are discarded)
train_precision, train_recall = precision_recall_fscore_support(
    y_train, y_train_pred, average='binary'
)[:2]
test_precision, test_recall = precision_recall_fscore_support(
    y_test, y_test_pred, average='binary'
)[:2]

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

print("Training Precision:", train_precision)
print("Testing Precision:", test_precision)

print("Training Recall:", train_recall)
print("Testing Recall:", test_recall)

# F1 on each split
train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)

print("Training F1:", train_f1)
print("Testing F1:", test_f1)

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the test predictions (rows: actual, columns: predicted)
model_cm = confusion_matrix(y_test, y_test_pred)

# Draw it as an annotated heat map
plt.figure()
sns.heatmap(model_cm, annot=True, fmt="d")
plt.title("Gradient Boost Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")

# Repeat the headline test metrics next to the plot
for metric_label, metric_value in (
    ("Accuracy:", test_accuracy),
    ("Precision:", test_precision),
    ("Recall:", test_recall),
    ("F1 Score:", test_f1),
):
    print(metric_label, metric_value)

# Flatten the 2x2 matrix into its four cells
tn, fp, fn, tp = model_cm.ravel()

print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("True Positives:", tp)

# Fit every baseline classifier and collect its train/test metrics in one table.

# Explicit labels: keying by class name made the two LogisticRegression
# variants share one key, so the second silently overwrote the first.
named_models = {
    "RandomForest": rf_classifier,
    "SVC": svm_classifier,
    "LogisticRegression": lr_classifier,
    "LinearSVC": lsvm,
    "LogisticRegression_L1": logit,
}

# Dictionary to store metrics for each model
model_metrics = {}

for model_name, estimator in named_models.items():

    # Fit current model
    estimator.fit(X_train, y_train)

    # Generate predictions
    train_pred = estimator.predict(X_train)
    test_pred = estimator.predict(X_test)

    # Calculate metrics
    acc_train = accuracy_score(y_train, train_pred)
    acc_test = accuracy_score(y_test, test_pred)

    precision_train, recall_train, _, _ = precision_recall_fscore_support(y_train, train_pred, average='binary')
    precision_test, recall_test, _, _ = precision_recall_fscore_support(y_test, test_pred, average='binary')

    f1_train = f1_score(y_train, train_pred)
    f1_test = f1_score(y_test, test_pred)

    # Store into dictionary (recall was computed before but never recorded)
    model_metrics[model_name] = {
        "Acc_Train": acc_train,
        "Acc_Test": acc_test,
        "F1_Train": f1_train,
        "F1_Test": f1_test,
        "Precision_Train": precision_train,
        "Precision_Test": precision_test,
        "Recall_Train": recall_train,
        "Recall_Test": recall_test,
    }

# Convert dictionary to dataframe: one row per model, one column per metric
metrics_df = pd.DataFrame(model_metrics).T

print(metrics_df)

# Fit the three boosting classifiers with their default hyperparameters and
# predict the test split with each.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train, y_train)
ada_pred = ada_model.predict(X_test)

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

# Test accuracy per model
xgb_acc, ada_acc, gb_acc = (
    accuracy_score(y_test, predictions) for predictions in (xgb_pred, ada_pred, gb_pred)
)

# Record the accuracies alongside the other models' metrics
model_metrics.update({
    'XGB': {'Acc_Test': xgb_acc},
    'AdaBoost': {'Acc_Test': ada_acc},
    'GBoost': {'Acc_Test': gb_acc},
})

# XGBoost: precision/recall for the positive class, then F1
xgb_precision, xgb_recall = precision_recall_fscore_support(y_test, xgb_pred, average='binary')[:2]
xgb_f1 = f1_score(y_test, xgb_pred)

print("XGBoost Testing Accuracy:", xgb_acc)
print("XGBoost Testing Precision:", xgb_precision)
print("XGBoost Testing Recall:", xgb_recall)
print("XGBoost Testing F1:", xgb_f1)

# AdaBoost
ada_precision, ada_recall = precision_recall_fscore_support(y_test, ada_pred, average='binary')[:2]
ada_f1 = f1_score(y_test, ada_pred)

print("AdaBoost Testing Accuracy:", ada_acc)
print("AdaBoost Testing Precision:", ada_precision)
print("AdaBoost Testing Recall:", ada_recall)
print("AdaBoost Testing F1:", ada_f1)

# Gradient Boosting
gb_precision, gb_recall = precision_recall_fscore_support(y_test, gb_pred, average='binary')[:2]
gb_f1 = f1_score(y_test, gb_pred)

print("GBoost Testing Accuracy:", gb_acc)
print("GBoost Testing Precision:", gb_precision)
print("GBoost Testing Recall:", gb_recall)
print("GBoost Testing F1:", gb_f1)

import matplotlib.pyplot as plt

# Hand-recorded F1 score per model, in plotting order
f1_by_model = {
    'Random Forest': 0.86,
    'L2 Logistic Regression': 0.68,
    'Logistic Regression': 0.67,
    'Kernel SVM': 0.60,
    'Gradient Boost': 0.65,
    'XGBoost': 0.64,
}
models = list(f1_by_model.keys())
f1_scores = list(f1_by_model.values())

# One bar per model
fig, ax = plt.subplots()
ax.bar(models, f1_scores)

ax.set_title('Model F1 Scores')
ax.set_ylabel('F1 Score')
# Slant the model names so they do not run into each other
ax.set_xticklabels(models, rotation=30, ha='right')

# Tighten the layout and export the chart
plt.tight_layout()
plt.savefig('f1_scores.png')
print('Bar chart exported to f1_scores.png')

"""**XGBoost:**

Training Accuracy: 0.642229814946456
Testing Accuracy: 0.6326699073649454

Training Precision: 0.6385358185492926
Testing Precision: 0.6332853025936599

Training Recall: 0.6530246658398787
Testing Recall: 0.64102096627165

Training F1: 0.6456989735670814
Testing F1: 0.6371296547974994

**Gradient Boosting**

Training Accuracy: 0.6497282671008278
Testing Accuracy: 0.6460607172337889

Training Precision: 0.6486634932259245
Testing Precision: 0.6479796141244994

Training Recall: 0.6509576960176381
Testing Recall: 0.6490428441203282

Training F1: 0.6498085696600104
Testing F1: 0.6485107933327261

**AdaBoost**

Training Accuracy: 0.6117773854020959
Testing Accuracy: 0.6138677428230762

Training Precision: 0.6070780800707808
Testing Precision: 0.6118224872829329

Training Recall: 0.6303339304579486
Testing Recall: 0.6359161349134002

Training F1: 0.6184874707048855
Testing F1: 0.6236366887180403

**Random Forest**

Training Accuracy: 0.9604210140108693
Testing Accuracy: 0.9473539392827662

Training Precision: 0.9267981092705362
Testing Precision: 0.9059348652669863

Training Recall: 0.9996784713609848
Testing Recall: 0.9990884229717412

Training F1: 0.9618597251071728
Testing F1: 0.9502340905149992

**Support Vector Machine (SVM)**
Training Accuracy: 0.5992341030521222
Testing Accuracy: 0.6051545446207466

Training Precision: 0.6024234328785422
Testing Precision: 0.6123809523809524

Training Recall: 0.5800376647834274
Testing Recall: 0.5861440291704649

Training F1: 0.5910186506915031
Testing F1: 0.5989753143921751

**Logistic Regression**

Training Accuracy: 0.4992318099474879
Testing Accuracy: 0.5030725488397688

Training Precision: 0.4992318099474879
Testing Precision: 0.5030725488397688

Training Recall: 1.0
Testing Recall: 1.0

Training F1: 0.6659834811869073
Testing F1: 0.66939223822309

Code to produce visualizations for my findings. The first one was too messy and hard to read, and I didn't feel that the second one provided enough useful information to use. The third code block with the table and accuracy/F1 bar charts were used in the paper.
"""

import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart of all four metrics per model, with the testing values drawn
# semi-transparently on top of the training values.

# Define the models and their corresponding metrics (hand-recorded results)
models = ['XGBoost', 'Gradient Boosting', 'AdaBoost', 'Random Forest', 'SVM', 'Logistic Regression']
training_accuracy = [0.6422, 0.6497, 0.6118, 0.9604, 0.5992, 0.4992]
testing_accuracy = [0.6327, 0.6461, 0.6139, 0.9474, 0.6052, 0.5031]

training_precision = [0.6385, 0.6487, 0.6071, 0.9268, 0.6024, 0.4992]
testing_precision = [0.6333, 0.6480, 0.6118, 0.9059, 0.6124, 0.5031]

training_recall = [0.6530, 0.6510, 0.6303, 0.9997, 0.5800, 1.0]
testing_recall = [0.6410, 0.6490, 0.6359, 0.9991, 0.5861, 1.0]

training_f1 = [0.6457, 0.6498, 0.6185, 0.9619, 0.5910, 0.6660]
testing_f1 = [0.6371, 0.6485, 0.6236, 0.9502, 0.5990, 0.6694]

# Set up the bar positions. Groups are one unit apart and hold four bars, so the
# bar width must stay below 0.25; the previous 0.35 made each group 1.4 units wide
# and its bars overlapped the next model's.
bar_width = 0.2
index = np.arange(len(models))

metric_names = ['Accuracy', 'Precision', 'Recall', 'F1']
training_series = [training_accuracy, training_precision, training_recall, training_f1]
testing_series = [testing_accuracy, testing_precision, testing_recall, testing_f1]

plt.figure(figsize=(12, 8))

# Plot the training metrics
for slot, (metric_name, values) in enumerate(zip(metric_names, training_series)):
    plt.bar(index + slot * bar_width, values, width=bar_width, label=f'Training {metric_name}')

# Plot the testing metrics at the same positions, semi-transparent
for slot, (metric_name, values) in enumerate(zip(metric_names, testing_series)):
    plt.bar(index + slot * bar_width, values, width=bar_width, alpha=0.5, label=f'Testing {metric_name}')

# Customize the plot; tick labels sit at the centre of each four-bar group
plt.xlabel('Models', fontsize=14)
plt.ylabel('Metrics', fontsize=14)
plt.title('Model Performance Metrics', fontsize=16)
plt.xticks(index + 1.5 * bar_width, models)
plt.legend()
plt.ylim(0, 1.2)

# Show the plot
plt.show()

import matplotlib.pyplot as plt
import numpy as np

# Hand-recorded training/testing F1 scores, one entry per model
models = ['XGBoost', 'Gradient Boosting', 'AdaBoost', 'Random Forest', 'SVM', 'Logistic Regression']
training_f1 = [0.6457, 0.6498, 0.6185, 0.9619, 0.5910, 0.6660]
testing_f1 = [0.6371, 0.6485, 0.6236, 0.9502, 0.5990, 0.6694]

# Two side-by-side bars per model
bar_width = 0.35
index = np.arange(len(models))
training_positions = index
testing_positions = index + bar_width
tick_positions = index + 0.5 * bar_width  # midway between the two bars

plt.figure(figsize=(10, 6))
plt.bar(training_positions, training_f1, width=bar_width, label='Training F1')
plt.bar(testing_positions, testing_f1, width=bar_width, alpha=0.5, label='Testing F1')

# Titles, axis labels, ticks and legend
plt.title('Model Performance (F1 Score)', fontsize=16)
plt.xlabel('Models', fontsize=14)
plt.ylabel('F1 Score', fontsize=14)
plt.xticks(tick_positions, models)
plt.ylim(0, 1.2)  # headroom above 1.0 so the legend does not cover the bars
plt.legend()

# Render the figure
plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# Model metrics, hand-copied from the recorded results and truncated to 4 decimals
models = ['XGBoost', 'Gradient Boosting','AdaBoost', 'Random Forest', 'SVM', 'Logistic Regression']
train_accuracy = [0.6422, 0.6497, 0.6117, 0.9604, 0.5992, 0.4992]
test_accuracy = [0.6326, 0.6460, 0.6138, 0.9473, 0.6051, 0.5030]
train_precision = [0.6385, 0.6486, 0.6070, 0.9267, 0.6024, 0.4992]
test_precision = [0.6332, 0.6479, 0.6118, 0.9059, 0.6123, 0.5030]
# Gradient Boosting training recall was recorded as 0.65095...; the earlier 0.6505
# was a transcription slip.
train_recall = [0.6530, 0.6509, 0.6303, 0.9996, 0.5800, 1.0]
test_recall = [0.6410, 0.6490, 0.6359, 0.9990, 0.5861, 1.0]
train_f1 = [0.6456, 0.6498, 0.6184, 0.9618, 0.5910, 0.6659]
test_f1 = [0.6371, 0.6485, 0.6236, 0.9502, 0.5989, 0.6693]

# Create dataframe: one row per model, one column per metric
metrics = pd.DataFrame({
    'Model': models,
    'Train Accuracy': train_accuracy,
    'Test Accuracy': test_accuracy,
    'Train Precision': train_precision,
    'Test Precision': test_precision,
    'Train Recall': train_recall,
    'Test Recall': test_recall,
    'Train F1': train_f1,
    'Test F1': test_f1
})

# Print metrics dataframe
print(metrics)

# Plot horizontal bar chart of test F1 and test accuracy, ordered by test F1
ranked = metrics[['Model', 'Test F1', 'Test Accuracy']].sort_values('Test F1', ascending=False)
ranked.plot(x='Model', y=['Test F1', 'Test Accuracy'], kind='barh', title='Model Evaluation Metrics')
# The value axis carries both F1 and accuracy, so it gets a neutral label
plt.xlabel('Score')

plt.tight_layout()
plt.savefig('model_eval_plot.png')
plt.show()